import pandas as pd
import altair as alt
# Load the "jobs" table; the charts below use its job, sex, year and perc
# columns.
jobs_url = "https://cdn.jsdelivr.net/npm/vega-datasets@2.8.0/data/jobs.json"
jobs = pd.read_json(jobs_url)

# Homework 5
Your Turn: Wide-long and long-wide
# Disable the row limit of the default data transformer so the whole jobs
# table can be embedded in the chart.
alt.data_transformers.enable('default', max_rows=None)

# Long -> wide: pivot the 'year' values into one column per year (holding
# 'perc') for every job/sex pair, then plot the 1950 column against the 2000
# column.
alt.Chart(jobs).transform_pivot(
    'year',
    groupby=['job', 'sex'],
    value='perc'
).mark_point().encode(
    x='1950:Q',
    y='2000:Q',
    color='sex:N',
    tooltip=['sex', '1950:Q', '2000:Q', 'job']
).properties(title="Percentage of men and women in various fields: 1950 vs 2000")

# Above is my plot showing percentage of men/women in different jobs in 1950
# and 2000. One thing I would’ve liked to add here is logarithmic axes, but I
# was having issues when doing so.
# NOTE(review): a log scale cannot place zero values; if 'perc' contains zeros
# for some job/sex/year combinations that would explain the issue -- TODO
# confirm (a 'symlog' scale or filtering out zeros would be the usual fixes).
from altair import datum
# One line per job showing 'perc' over the years, with a separate facet row
# for each sex.
alt.Chart(jobs).mark_line().encode(
    x="year:O",
    y="perc:Q",
    color="job:N"
).facet(row="sex:N")

# The plot above shows the percentage of men and women working different jobs
# over time, using every job in the dataset. This is obviously cluttered…
# Same faceted line chart as before, restricted to two jobs so the lines are
# readable.
alt.Chart(jobs).mark_line().encode(
    x="year:O",
    y="perc:Q",
    color="job:N"
).facet(
    row="sex:N"
).transform_filter(
    # Keep only rows whose job is one of the two selected fields.
    (datum.job == "Accountant / Auditor") | (datum.job == "Carpenter")
)

# This plot shows only accountants/auditors and carpenters, which simplifies
# the graph significantly, although it does show less info.
# Single-job view: 'perc' over the years for accountants/auditors only, with
# one colored line per sex instead of facets.
alt.Chart(jobs).mark_line().encode(
    x="year:O",
    y="perc:Q",
    color="sex:N"
).transform_filter(
    datum.job == "Accountant / Auditor"
)

# This graph shows only the allocation for accountant/auditors (my future
# career) over time, using color to differentiate sexes.
Your turn: Maps
# Load the gapminder table and show its first four rows to check the columns.
gapminder_url = "https://cdn.jsdelivr.net/npm/vega-datasets@1.29.0/data/gapminder.json"
gapminder = pd.read_json(gapminder_url)
print(gapminder[0:4])
# Recorded output:
#    year      country  cluster       pop  life_expect  fertility
# 0  1955  Afghanistan        0   8891209       30.332        7.7
# 1  1960  Afghanistan        0   9829450       31.997        7.7
# 2  1965  Afghanistan        0  10997885       34.020        7.7
# 3  1970  Afghanistan        0  12430623       36.088        7.7
import json
from urllib.request import urlopen
# Download the world-atlas TopoJSON and compare its country names with the
# names used in gapminder, to see which rows can be joined onto the map.
world2_url = 'https://cdn.jsdelivr.net/npm/world-atlas@2/countries-110m.json'
# Use the response as a context manager so the HTTP connection is closed
# once the JSON has been parsed.
with urlopen(world2_url) as response:
    world2 = json.load(response)

country_names = [
    p['properties']['name']
    for p in world2['objects']['countries']['geometries']
]

map_names = set(country_names)
data_names = set(gapminder['country'])

# sorted() keeps the printed lists in a stable, reproducible order (plain
# set -> list conversion has no guaranteed order).
common_names = sorted(map_names & data_names)
missing_names = sorted(data_names - map_names)
extra_names = sorted(map_names - data_names)

# names in gapminder and in map data
print("in common:", len(common_names), common_names)
# names in gapminder but not in map data
print("missing in map:", len(missing_names), missing_names)
# names in the map data but not in gapminder
print("extra in map", len(extra_names), extra_names)
# Recorded output of an earlier run (made before the lists were sorted, so the
# order of the names differs from what the code above prints):
# in common: 57 ['New Zealand', 'Chile', 'Portugal', 'Japan', 'Bahamas', 'South Africa', 'Peru', 'India', 'Canada', 'Ecuador', 'Egypt', 'Iceland', 'China', 'Pakistan', 'Saudi Arabia', 'Jamaica', 'Rwanda', 'Philippines', 'Bolivia', 'Israel', 'Finland', 'France', 'Indonesia', 'Ireland', 'Nigeria', 'Georgia', 'Lebanon', 'Bangladesh', 'North Korea', 'Afghanistan', 'Brazil', 'Norway', 'Cuba', 'Kenya', 'Turkey', 'Switzerland', 'Mexico', 'Australia', 'Spain', 'Poland', 'Italy', 'South Korea', 'Austria', 'Colombia', 'Croatia', 'Venezuela', 'Iraq', 'El Salvador', 'Greece', 'Belgium', 'Germany', 'Iran', 'Costa Rica', 'Argentina', 'Haiti', 'Netherlands', 'United Kingdom']
missing in map: 6 ['Barbados', 'United States', 'Aruba', 'Hong Kong', 'Grenada', 'Dominican Republic']
extra in map 120 ['Taiwan', 'Tanzania', "Côte d'Ivoire", 'Kyrgyzstan', 'Moldova', 'Lesotho', 'Malawi', 'Myanmar', 'Eritrea', 'Timor-Leste', 'Romania', 'Honduras', 'Turkmenistan', 'Slovakia', 'Togo', 'Jordan', 'Zimbabwe', 'Mauritania', 'Antarctica', 'W. Sahara', 'Tunisia', 'Luxembourg', 'Yemen', 'Madagascar', 'Latvia', 'Slovenia', 'Eq. Guinea', 'Fiji', 'Mali', 'Thailand', 'Armenia', 'Albania', 'Dominican Rep.', 'Czechia', 'Algeria', 'Qatar', 'Sudan', 'Lithuania', 'Fr. S. Antarctic Lands', 'Vietnam', 'Guatemala', 'Mongolia', 'Serbia', 'Hungary', 'Oman', 'Tajikistan', 'Kazakhstan', 'Benin', 'eSwatini', 'Guinea', 'United Arab Emirates', 'Laos', 'Macedonia', 'Syria', 'Niger', 'Solomon Is.', 'Libya', 'United States of America', 'Namibia', 'N. Cyprus', 'Falkland Is.', 'Montenegro', 'Sweden', 'Uruguay', 'Morocco', 'Angola', 'Congo', 'Ethiopia', 'Cambodia', 'Chad', 'Guyana', 'Zambia', 'Belarus', 'Mozambique', 'Trinidad and Tobago', 'Malaysia', 'S. Sudan', 'Gabon', 'Panama', 'Gambia', 'Ukraine', 'Kuwait', 'Denmark', 'Somaliland', 'Bulgaria', 'Senegal', 'Ghana', 'Nepal', 'Burkina Faso', 'Liberia', 'Brunei', 'New Caledonia', 'Burundi', 'Somalia', 'Uzbekistan', 'Greenland', 'Central African Rep.', 'Vanuatu', 'Papua New Guinea', 'Nicaragua', 'Estonia', 'Bhutan', 'Cyprus', 'Uganda', 'Sierra Leone', 'Russia', 'Palestine', 'Cameroon', 'Belize', 'Botswana', 'Djibouti', 'Sri Lanka', 'Azerbaijan', 'Paraguay', 'Guinea-Bissau', 'Dem. Rep. Congo', 'Suriname', 'Bosnia and Herz.', 'Puerto Rico', 'Kosovo']
# Choropleth of life expectancy: join gapminder onto the world-atlas country
# shapes by country name.
world = alt.topo_feature(world2_url, feature='countries')

# gapminder spells two countries differently from the map data (compare the
# "missing in map" and "extra in map" lists printed earlier); rename them so
# they join.
name_fixes = {
    'United States': 'United States of America',
    'Dominican Republic': 'Dominican Rep.',
}

# gapminder holds one row per country per year, so the country name alone is
# not a unique join key. Pick a single year (the most recent one) explicitly
# so that every country contributes exactly one row to the lookup.
latest_year = gapminder['year'].max()
gapminder_latest = gapminder[gapminder['year'] == latest_year].assign(
    country=lambda df: df['country'].replace(name_fixes)
)

world_map = alt.Chart(world).mark_geoshape(
).transform_lookup(
    lookup="properties.name",
    from_=alt.LookupData(gapminder_latest, 'country', ['life_expect', 'year'])
).encode(
    fill='life_expect:Q'
).properties(
    width=800,
    height=400
)
world_map

# Note: I’m not sure why there’s no legend showing up.
# NOTE(review): the quantitative fill encoding should produce a legend; one
# possibility is that the 800px-wide chart pushes it outside the visible page
# area -- TODO confirm in the rendered output.
from vega_datasets import data
# Choropleth of the number of airports per US state.
airports = data.airports()
states = alt.topo_feature(data.us_10m.url, feature='states')

# The us_10m state shapes are keyed by a numeric FIPS `id` (the key used in
# Altair's choropleth examples), while airports identifies states by their
# two-letter postal code, so a translation table is needed to join the two.
state_fips = {
    'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'CA': 6, 'CO': 8, 'CT': 9,
    'DE': 10, 'DC': 11, 'FL': 12, 'GA': 13, 'HI': 15, 'ID': 16, 'IL': 17,
    'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24,
    'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31,
    'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NC': 37, 'ND': 38,
    'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'RI': 44, 'SC': 45, 'SD': 46,
    'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'WA': 53, 'WV': 54,
    'WI': 55, 'WY': 56,
}

# Count airports per state up front. Aggregating with count() inside the
# chart encoding would group away the individual state shapes.
airports_per_state = (
    airports.groupby('state').size().rename('airport_count').reset_index()
)
airports_per_state['id'] = airports_per_state['state'].map(state_fips)
# Drop codes without a FIPS entry (e.g. territories) and store the key as an
# integer so it matches the ids in the map data.
airports_per_state = airports_per_state.dropna(subset=['id']).astype({'id': int})

state_map = alt.Chart(states).mark_geoshape(
    stroke='steelblue'
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(airports_per_state, 'id', ['state', 'airport_count'])
).encode(
    fill='airport_count:Q',
    tooltip=['state:N', 'airport_count:Q']
).project('albersUsa')
state_map.properties(width=500, height=300)

# Original note: For some reason the ‘fill’ and the projection is not working.
# It’s probably an issue with the lookup, but I haven’t been able to identify
# it.
# Cause: the lookup used a 'state' field that the map features do not carry,
# and fill='count()' aggregated all shapes together. The version above joins
# on the FIPS id and fills by a precomputed per-state count instead.
Visualization galleries
I saw a graphic on the Reddit page of NFL receiving yards leaders in 2023, but adjusted for Pass Interference calls. The story here is that if DPI yards were included in total yards, the leaderboard changes dramatically. It’s a good graphic because it makes use of so many features. It has bars for each player’s actual receiving yards, another bar for DPI yards (which is a different color), text showing the number of actual, DPI, and adjusted total yards, and arrows showing the shift in ranking due to the adjustment. I know how to implement the stacked bars, text, x- and y-axis labels, and the colors. I don’t know how to implement the arrows showing the change in ranking on the leaderboard.